Environment Variables and Packages

options(java.parameters = "-Xmx2048m",
        stringsAsFactors = FALSE, 
        encoding = 'UTF-8')

suppressPackageStartupMessages({
  # ISLR2
  library(ISLR2)
  # Splines
  library(splines)
  # DM
  library(zip)
  library(openxlsx)
  library(readxl)
  library(writexl)
  library(RcppRoll)
  library(plyr)
  library(stringi)
  library(feather)
  library(RODBC)
  library(MASS)
  library(car)
  library(caret)
  library(data.table)
  library(lubridate)
  library(plotly)
  library(pROC)
  library(tidymodels)
  library(tidyverse)
})

Boston

Select particular variables of Boston data.

boston.subset <- Boston %>% 
  select(dis, nox) %>% 
  arrange(dis)

head(boston.subset)
##      dis   nox
## 1 1.1296 0.668
## 2 1.1370 0.668
## 3 1.1691 0.631
## 4 1.1742 0.668
## 5 1.1781 0.659
## 6 1.2024 0.631

Cubic polynomial regression.

Fit.

boston.poly <- lm(nox ~ poly(dis, degree = 3), data = boston.subset)
summary(boston.poly)
## 
## Call:
## lm(formula = nox ~ poly(dis, degree = 3), data = boston.subset)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, degree = 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, degree = 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, degree = 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16

Plot.

boston.poly.pred <- boston.subset %>% 
  mutate(nox_pred = boston.poly$fitted.values)

boston.poly.plot <- plot_ly(data = boston.poly.pred, x = ~dis) %>% 
  add_trace(y = ~nox, name = 'True', type = 'scatter', mode = 'markers') %>% 
  add_trace(y = ~nox_pred, name = 'Fit', type = 'scatter', mode = 'lines') %>% 
  layout(
    title = 'Polynomial Fits'
  )
boston.poly.plot

Polynomial fits of different degrees

Fit.

boston.poly.list <- list()
for (i in 1:10) {
  boston.poly.list[[i]] <- lm(nox ~ poly(dis, degree = i), data = boston.subset)
}

Plot.

pred.name <- paste0('nox_pred', str_pad(1:10, 2, side = 'left', pad = 0))

boston.poly.list.pred <- boston.subset
for (i in 1:10) {
  boston.poly.list.pred[pred.name[i]] <- boston.poly.list[[i]]$fitted.values
}

boston.poly.list.plot <- plot_ly(data = boston.poly.list.pred, x = ~dis) %>% 
  add_trace(y = ~nox, name = 'nox', type = 'scatter', mode = 'markers') %>% 
    add_trace(y = ~get(pred.name[1]), name = pred.name[1], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[2]), name = pred.name[2], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[3]), name = pred.name[3], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[4]), name = pred.name[4], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[5]), name = pred.name[5], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[6]), name = pred.name[6], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[7]), name = pred.name[7], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[8]), name = pred.name[8], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[9]), name = pred.name[9], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[10]), name = pred.name[10], 
              type = 'scatter', mode = 'lines') %>% 
  layout(
    title = 'Polynomial Fits of Different Degrees'
  )
boston.poly.list.plot

Residual sum of squares.

boston.poly.list.res <- boston.subset
for (i in 1:10) {
  boston.poly.list.res[paste0('nox_res', str_pad(i, 2, side = 'left', pad = 0))] <- boston.poly.list[[i]]$residuals
}

boston.poly.list.rss <- boston.poly.list.res %>% 
  select(starts_with('nox_')) %>% 
  summarise_all(function(x) {sum(x^2)}) %>% 
  as.data.table() %>% 
  melt(variable.name = 'poly', value.name = 'rss')
boston.poly.list.rss
##          poly      rss
##  1: nox_res01 2.768563
##  2: nox_res02 2.035262
##  3: nox_res03 1.934107
##  4: nox_res04 1.932981
##  5: nox_res05 1.915290
##  6: nox_res06 1.878257
##  7: nox_res07 1.849484
##  8: nox_res08 1.835630
##  9: nox_res09 1.833331
## 10: nox_res10 1.832171

Cross-validation of polynomial

Split data set to 10-fold.

set.seed(0803)
folds.index <- createFolds(1:nrow(boston.subset), k = 10)
boston.poly.rss.degrees <- c()
for (i in 1:10) {
  rss.folds <- c()
  
  for (j in 1:10) {
    cv.train <- boston.subset %>% 
      filter(!(row_number() %in% folds.index[[j]]))
    
    cv.valid <- boston.subset %>% 
      filter(row_number() %in% folds.index[[j]])
    
    poly.cv <- lm(nox ~ poly(dis, degree = i), data = cv.train)
    
    rss <- sum((cv.valid$nox - predict(poly.cv, newdata = cv.valid['dis'])) ^ 2)
    rss.folds <- append(rss.folds, rss)
  }
  
  boston.poly.rss.degrees <- append(boston.poly.rss.degrees, sum(rss.folds))
}

boston.poly.cv.rss <- data.frame(poly = pred.name, 
                                 rss = boston.poly.rss.degrees)
boston.poly.cv.rss
##          poly      rss
## 1  nox_pred01 2.788914
## 2  nox_pred02 2.060030
## 3  nox_pred03 1.959390
## 4  nox_pred04 1.972514
## 5  nox_pred05 2.131732
## 6  nox_pred06 2.621185
## 7  nox_pred07 5.269533
## 8  nox_pred08 2.893226
## 9  nox_pred09 3.863362
## 10 nox_pred10 2.733447

When degree = 3, the polynomial regression has the minimal CV RSS, so the optimal degree is 3.

Regression spline

Regression spline with 4 degrees of freedom.

(boston.sp.knots <- attr(ns(boston.subset$dis, df = 4), 'knots'))
##      25%      50%      75% 
## 2.100175 3.207450 5.188425
boston.sp <- lm(nox ~ bs(dis, df = 4, knots = boston.sp.knots, degree = 3), data = boston.subset)
summary(boston.sp)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = boston.sp.knots, degree = 3), 
##     data = boston.subset)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.128538 -0.037813 -0.009987  0.022644  0.195494 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                            0.65622    0.02370
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)1  0.10222    0.03516
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)2 -0.02963    0.02338
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)3 -0.15959    0.02791
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)4 -0.22815    0.03324
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)5 -0.26272    0.04930
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)6 -0.24002    0.05434
##                                                       t value Pr(>|t|)    
## (Intercept)                                            27.689  < 2e-16 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)1   2.907  0.00381 ** 
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)2  -1.267  0.20571    
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)3  -5.718 1.86e-08 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)4  -6.864 1.99e-11 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)5  -5.329 1.50e-07 ***
## bs(dis, df = 4, knots = boston.sp.knots, degree = 3)6  -4.417 1.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06062 on 499 degrees of freedom
## Multiple R-squared:  0.7295, Adjusted R-squared:  0.7263 
## F-statistic: 224.3 on 6 and 499 DF,  p-value: < 2.2e-16

Plot.

boston.sp.pred <- boston.subset %>% 
  mutate(nox_pred = boston.sp$fitted.values)

boston.sp.plot <- plot_ly(data = boston.sp.pred, x = ~dis) %>% 
  add_trace(y = ~nox, name = 'True', type = 'scatter', mode = 'markers') %>% 
  add_trace(y = ~nox_pred, name = 'Fit', type = 'scatter', mode = 'lines') %>% 
  layout(
    title = 'Spline Fits'
  )
boston.sp.plot

Regression spline of different degress of freedom

Fit.

boston.sp.knots.list <- list()
boston.sp.list <- list()
for (i in 1:10) {
  boston.sp.knots.list[[i]] <- attr(ns(boston.subset$dis, df = i), 'knots')
  boston.sp.list[[i]] <- lm(nox ~ bs(dis, df = i, knots = boston.sp.knots.list[[i]], degree = 3), data = boston.subset)
}

Plot.

pred.name <- paste0('nox_pred', str_pad(1:10, 2, side = 'left', pad = 0))

boston.sp.list.pred <- boston.subset
for (i in 1:10) {
  boston.sp.list.pred[pred.name[i]] <- boston.sp.list[[i]]$fitted.values
}

boston.sp.list.plot <- plot_ly(data = boston.sp.list.pred, x = ~dis) %>% 
  add_trace(y = ~nox, name = 'nox', type = 'scatter', mode = 'markers') %>% 
    add_trace(y = ~get(pred.name[1]), name = pred.name[1], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[2]), name = pred.name[2], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[3]), name = pred.name[3], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[4]), name = pred.name[4], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[5]), name = pred.name[5], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[6]), name = pred.name[6], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[7]), name = pred.name[7], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[8]), name = pred.name[8], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[9]), name = pred.name[9], 
              type = 'scatter', mode = 'lines') %>% 
    add_trace(y = ~get(pred.name[10]), name = pred.name[10], 
              type = 'scatter', mode = 'lines') %>% 
  layout(
    title = 'Spline Fits of Different Degrees of Freedom'
  )
boston.sp.list.plot

Residual sum of squares.

boston.sp.list.res <- boston.subset
for (i in 1:10) {
  boston.sp.list.res[paste0('nox_res', str_pad(i, 2, side = 'left', pad = 0))] <- boston.sp.list[[i]]$residuals
}

boston.sp.list.rss <- boston.sp.list.res %>% 
  select(starts_with('nox_')) %>% 
  summarise_all(function(x) {sum(x^2)}) %>% 
  as.data.table() %>% 
  melt(variable.name = 'sp', value.name = 'rss')
boston.sp.list.rss
##            sp      rss
##  1: nox_res01 1.934107
##  2: nox_res02 1.922775
##  3: nox_res03 1.840173
##  4: nox_res04 1.833966
##  5: nox_res05 1.829884
##  6: nox_res06 1.816995
##  7: nox_res07 1.825653
##  8: nox_res08 1.792535
##  9: nox_res09 1.796992
## 10: nox_res10 1.788999

Cross-validation of spline

Split data set to 10-fold.

set.seed(0803)
folds.index <- createFolds(1:nrow(boston.subset), k = 10)
boston.sp.rss.degrees <- c()
for (i in 1:10) {
  rss.folds <- c()
  
  for (j in 1:10) {
    cv.train <- boston.subset %>% 
      filter(!(row_number() %in% folds.index[[j]]))
    
    cv.valid <- boston.subset %>% 
      filter(row_number() %in% folds.index[[j]])
    
    knots <- attr(ns(cv.train$dis, df = i), 'knots')
    sp.cv <- lm(nox ~ bs(dis, df = i, knots = knots, degree = 3), data = cv.train)
    
    rss <- sum((cv.valid$nox - predict(sp.cv, newdata = cv.valid['dis'])) ^ 2)
    rss.folds <- append(rss.folds, rss)
  }
  
  boston.sp.rss.degrees <- append(boston.sp.rss.degrees, sum(rss.folds))
}

boston.sp.cv.rss <- data.frame(spline = pred.name, 
                               rss = boston.sp.rss.degrees)
boston.sp.cv.rss
##        spline      rss
## 1  nox_pred01 1.959390
## 2  nox_pred02 1.974652
## 3  nox_pred03 1.874106
## 4  nox_pred04 1.875422
## 5  nox_pred05 1.874337
## 6  nox_pred06 1.864625
## 7  nox_pred07 1.876487
## 8  nox_pred08 1.863335
## 9  nox_pred09 1.874418
## 10 nox_pred10 1.865125

When degree of freedom is 8, the regression spline has the minimal CV RSS, so the optimal degree of freedom is 8.